Packages¶
In [1]:
import numpy as np # scientific computing toolkit
import pandas as pd # data analysis toolkit
import scanpy as sc # scanpy is referred to with sc.***
import matplotlib.pyplot as plt # Matplotlib is referred to with plt.***
from scipy import stats # for linear regressions
import seaborn as sns # for easy heatmaps
import scirpy as ir # TCR analysis
import scvelo as scv
#import mudata as md # New data structure
#from mudata import MuData
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header() # check if all needed versions are installed and up to date (old: print_Versions(), changed in new version of scanpy)
results_directory = './Analysis_output/'
results_file = results_directory + '/results_file/write/results.h5ad' # the file that will store the analysis results
Directory variables¶
In [2]:
import glob
import os
from pathlib import Path
sample_directory = './GEX_VDJ/'
analysis_info = './analysis_info/'
adata_directory = './analysis_output/IMMUN_COM/annotated_files/'
In [3]:
plt.rcParams['figure.figsize']=(6,6) #rescale figures
#sc.set_figure_params(dpi=100, dpi_save=400, color_map='viridis', vector_friendly=True, format='eps', ipython_format = 'png2x')
### you can set a default figure directory here for saving output
sc.settings.figdir = "./analysis_output/IMMUN_COM/"
## Figure directory for matplotlib figures/axes
save_figure_sc = "./analysis_output/IMMUN_COM/"
#Use
#plt.savefig(save_figure + 'image.pdf')
save_figure_ir = "./analysis_output/IMMUN_COM/"
#csv directory
read_csv = './analysis_output/IMMUN_COM/'
save_csv = r'./analysis_output/IMMUN_COM/'
Load data¶
In [4]:
# define sample metadata. Usually read from a file.
samples = {
"1_dp_mult": {"group": "mouse_1_20_dp_mult_unstim"},
"2_dp_mult": {"group": "mouse_21_40_dp_mult_unstim"},
"3_dp_mult": {"group": "mouse_41_60_dp_mult_unstim"},
"4_dp_mult": {"group": "mouse_61_80_dp_mult_unstim"},
}
In [5]:
# Create a list of AnnData objects (one for each sample)
adatas = []
for sample, sample_meta in samples.items():
gex_file = glob.glob(f'{sample_directory}/GEX/dp_mult/*{sample}*/outs/filtered_feature_bc_matrix/')[0]
tcr_file = glob.glob(f'{sample_directory}/VDJ/dp_mult/*{sample}*/outs/*annotations.json')[0]
adata = sc.read_10x_mtx(gex_file,
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True, gex_only=False) # write a cache file for faster subsequent reading
adata_tcr = ir.io.read_10x_vdj(tcr_file)
change_cat = adata_tcr.obs.columns.tolist() #NaN values in VDJ data can mess up analysis for some reason
for i in change_cat: #here all NaN (basically for TCR chain calls) are replaced by 'None'
adata_tcr.obs[i]=adata_tcr.obs[i].astype(str)
adata_tcr.obs[i]=adata_tcr.obs[i].fillna('None')
adata_tcr.obs[i]=adata_tcr.obs[i].replace('nan' ,'None')
adata_tcr.obs[i]=adata_tcr.obs[i].astype('category')
ir.pp.merge_with_ir(adata, adata_tcr)
adata.obs["experiment"] = sample
adata.obs["group"] = sample_meta["group"]
# concatenation only works with unique gene names
adata.var_names_make_unique()
adatas.append(adata)
... reading from cache file cache\GEX_VDJ-GEX-dp_mult-1_dp_mult_GEX_1-outs-filtered_feature_bc_matrix-matrix.h5ad WARNING: Non-standard locus name ignored: None ... reading from cache file cache\GEX_VDJ-GEX-dp_mult-2_dp_mult_GEX_2-outs-filtered_feature_bc_matrix-matrix.h5ad WARNING: Non-standard locus name ignored: None ... reading from cache file cache\GEX_VDJ-GEX-dp_mult-3_dp_mult_GEX_3-outs-filtered_feature_bc_matrix-matrix.h5ad WARNING: Non-standard locus name ignored: None ... reading from cache file cache\GEX_VDJ-GEX-dp_mult-4_dp_mult_GEX_4-outs-filtered_feature_bc_matrix-matrix.h5ad WARNING: Non-standard locus name ignored: None
Preprocessing¶
Hashtag Antibodies¶
Highest expressed genes in all samples and quality check for hashtag antibody expression - Show those genes that yield the highest fraction of counts in each single cells, across all cells.
In [6]:
nrows = len(adatas)//2
ncols = 2
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12,3*len(adatas)))
index = 0
axis = 0
for i, adata in enumerate(adatas):
if index <ncols:
sc.pl.highest_expr_genes(adata, n_top=20, ax=ax[axis,index], show=False)
ax[axis,index].set_title(samples[adata.obs.experiment[0]])
index = index + 1
else:
axis = axis + 1
sc.pl.highest_expr_genes(adata, n_top=20, ax=ax[axis,0], show=False)
ax[axis,0].set_title(samples[adata.obs.experiment[0]])
index = 1
fig.tight_layout()
plt.show()
fig.savefig(save_figure_sc + 'highest_expr_genes_dp_mult.pdf')
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
Basic filter¶
In the next step we will apply some bacic filters to these cells. Therefore we can control for the minimum number of genes per cell, to take this cell into account. A second filter will filter out genes, which are present in less than a certain number of cells.
In [7]:
for i, adata in enumerate(adatas):
print(adatas[i].obs.experiment[0] + ' :Total number of cells: {:d}'.format(adatas[i].n_obs))
sc.pp.filter_cells(adatas[i], min_genes=200)
sc.pp.filter_genes(adata[i], min_cells=3)
print(adatas[i].obs.experiment[0] +': Total number of cells after filtering: {:d}'.format(adatas[i].n_obs))
1_dp_mult :Total number of cells: 15768 filtered out 12 cells that have less than 200 genes expressed filtered out 32297 genes that are detected in less than 3 cells 1_dp_mult: Total number of cells after filtering: 15756 2_dp_mult :Total number of cells: 24934 filtered out 13 cells that have less than 200 genes expressed filtered out 32297 genes that are detected in less than 3 cells 2_dp_mult: Total number of cells after filtering: 24921 3_dp_mult :Total number of cells: 27965 filtered out 20 cells that have less than 200 genes expressed filtered out 32297 genes that are detected in less than 3 cells 3_dp_mult: Total number of cells after filtering: 27945 4_dp_mult :Total number of cells: 19428 filtered out 14 cells that have less than 200 genes expressed filtered out 32297 genes that are detected in less than 3 cells 4_dp_mult: Total number of cells after filtering: 19414
Annotation of data¶
In [8]:
bc_group = ['CD3', 'CD90']
bc_m = ['BC1','BC2','BC3','BC4','BC5','BC6','BC7','BC8','BC9','BC10']
bc_all = bc_group + bc_m
bc_m_HTO = []
for i in bc_m:
bc_m_HTO.append(i + '_HTO')
In [9]:
adatas_HTO = [0]*len(adatas)
for i, adata in enumerate(adatas):
adatas_HTO[i]=adata[:, adata.var_names.isin(bc_all)]
HashSolo¶
Normalize Barcode expression for plotting after running HashSolo
In [10]:
def clr_normalize_each_cell(adata, inplace=True):
"""Normalize count vector for each cell, i.e. for each row of .X"""
import numpy as np
import scipy
def seurat_clr(x):
# TODO: support sparseness
s = np.sum(np.log1p(x[x > 0]))
exp = np.exp(s / len(x))
return np.log1p(x / exp)
if not inplace:
adata = adata.copy()
# apply to dense or sparse matrix, along axis. returns dense matrix
adata.X = np.apply_along_axis(
seurat_clr, 1, (adata.X.toarray() if scipy.sparse.issparse(adata.X) else adata.X)
)
return adata
In [11]:
fig, ax = plt.subplots(ncols=4, figsize=(16,4))
for i, adata in enumerate(adatas_HTO):
#clr_normalize_each_cell(adata)
scv.pl.scatter(adata, x='CD3', y='CD90', show = False, ax = ax[i],xlim = [-1,250], ylim= [-1,5000])
fig.tight_layout()
plt.show()
Remove doublets
In [12]:
cutoff_list_cd3 = [15,15,15,10]
In [13]:
for i, adata in enumerate(adatas_HTO):
cd90 = (adata[: , 'CD90'].X>300).todense()
cd3 = (adata[: , 'CD3'].X>cutoff_list_cd3[i]).todense()
df = pd.DataFrame(cd3, columns=['CD3'])
df['CD90'] = cd90
adatas_HTO[i] = adata[df.sum(axis=1)<2].copy()
In [14]:
fig, ax = plt.subplots(ncols=4, figsize=(16,4))
for i, adata in enumerate(adatas_HTO):
scv.pl.scatter(adata, x='CD3', y='CD90', show = False, ax = ax[i], xlim = [-1,250], ylim= [-1,5000])
fig.tight_layout()
plt.show()
Annotate samples by barcode
In [15]:
for i, adata in enumerate(adatas_HTO):
adatas_HTO[i].obs['group_bc'] = 'None'
#Define cutoffs for Barcode expression
cd90 = (adata[: , 'CD90'].X>400).todense()
cd3 = (adata[: , 'CD3'].X>18).todense()
#make Dataframe from adata.X CD3 and CD90
df = pd.DataFrame(cd3, columns=['cd3'])
df['cd90'] = cd90
#Rename values in Dataframe
df.cd3[df.cd3==True]='CD3'
df.cd3[df.cd3==False]='None'
df.cd90[df.cd90==True]='CD90'
df.cd90[df.cd90==False]='None'
#Combine CD3 and CD90 column into one column
df['bc'] = df.cd3 + df.cd90
#Rename combined column so it makes sense
df.bc[df.bc=='NoneCD90'] = 'CD90'
df.bc[df.bc=='CD3None'] = 'CD3'
df.bc[df.bc=='NoneNone'] = 'None'
bc_list = df.bc.tolist()
#Annotate adatas
adatas_HTO[i].obs.group_bc=bc_list
In [16]:
fig, ax = plt.subplots(ncols=4, figsize=(16,4))
for i, adata in enumerate(adatas_HTO):
scv.pl.scatter(adata, x='CD3', y='CD90', color='group_bc', legend_loc='right_margin', show = False, ax = ax[i], xlim = [-1,150], ylim= [-1,2500])
fig.tight_layout()
plt.show()
In [17]:
for i, adata in enumerate(adatas_HTO):
adatas_HTO[i] = adata[:, adata.var_names.isin(bc_m)].copy()
Run rest of HashSolo
In [18]:
adatas_hashsolo = []
for i, adata in enumerate(adatas_HTO):
HTO_data=adata.copy()
HTO_data.layers["counts"] = HTO_data.X.copy()
HTO_counts=pd.DataFrame(HTO_data.X.todense(), columns=HTO_data.var_names+'_HTO', index=HTO_data.obs_names)
HTO_data.obs=pd.concat([HTO_data.obs,HTO_counts], axis=1)
sc.external.pp.hashsolo(HTO_data,cell_hashing_columns=bc_m_HTO)
adatas_hashsolo.append(HTO_data)
clr_normalize_each_cell(HTO_data)
sc.pp.log1p(HTO_data)
sc.pp.scale(HTO_data)
sc.pp.pca(HTO_data)
sc.pp.neighbors(HTO_data, n_neighbors=50)
sc.tl.leiden(HTO_data, key_added="HTO_leiden", resolution=.1)
#sc.tl.umap(HTO_data)
sc.tl.tsne(HTO_data)
#sc.pl.umap(HTO_data, color=HTO_data.var_names.tolist()+['singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification','HTO_leiden'], ncols=10, save='_HTO.pdf')
sc.pl.tsne(HTO_data, color=HTO_data.var_names.tolist()+['singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification','HTO_leiden'], ncols=5, save=str(adatas_HTO[i].obs.experiment[0])+'_HTO.pdf')
#sc.pl.pca(HTO_data, color=HTO_data.var_names.tolist()+['singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification'], ncols=5, save=str(adatas_HTO_temp[i].obs.experiment[0])+'_HTO.pdf')
Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
computing PCA
with n_comps=9
finished (0:00:00)
computing neighbors
using data matrix X directly
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:01:06)
running Leiden clustering
finished: found 11 clusters and added
'HTO_leiden', the cluster labels (adata.obs, categorical) (0:00:02)
computing tSNE
using data matrix X directly
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:36)
WARNING: saving figure to file analysis_output\IMMUN_COM\tsne1_dp_mult_HTO.pdf
Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
computing PCA
with n_comps=9
finished (0:00:00)
computing neighbors
using data matrix X directly
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
running Leiden clustering
finished: found 12 clusters and added
'HTO_leiden', the cluster labels (adata.obs, categorical) (0:00:04)
computing tSNE
using data matrix X directly
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:59)
WARNING: saving figure to file analysis_output\IMMUN_COM\tsne2_dp_mult_HTO.pdf
Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
computing PCA
with n_comps=9
finished (0:00:00)
computing neighbors
using data matrix X directly
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
running Leiden clustering
finished: found 13 clusters and added
'HTO_leiden', the cluster labels (adata.obs, categorical) (0:00:03)
computing tSNE
using data matrix X directly
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:48)
WARNING: saving figure to file analysis_output\IMMUN_COM\tsne3_dp_mult_HTO.pdf
Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
computing PCA
with n_comps=9
finished (0:00:00)
computing neighbors
using data matrix X directly
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
running Leiden clustering
finished: found 14 clusters and added
'HTO_leiden', the cluster labels (adata.obs, categorical) (0:00:01)
computing tSNE
using data matrix X directly
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:30)
WARNING: saving figure to file analysis_output\IMMUN_COM\tsne4_dp_mult_HTO.pdf